{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# -*- encoding: utf-8 -*-"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "reload(sys)\n",
    "sys.setdefaultencoding('utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from gensim import corpora,models,similarities,utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import logging\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 获取训练数据\n",
    "def getTrainSet(inFile):\n",
    "    # 文章标题集\n",
    "    title_set = []\n",
    "    # 训练集\n",
    "    train_set=[]\n",
    "    # 读入训练数据\n",
    "    f=open(inFile)\n",
    "    lines=f.readlines()\n",
    "    for line in lines:\n",
    "        article = line.replace('\\n','').split('\\t')\n",
    "        title = article[0]\n",
    "        title_set.append(title)\n",
    "        content = article[1:]\n",
    "        train_set.append(content)\n",
    "    f.close()\n",
    "    return (title_set,train_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 把筛选以后的结果写入txt文件\n",
    "def writeTitleSet(fileName,data,mode):\n",
    "    fw = open(fileName,mode)\n",
    "    for w in data:\n",
    "        fw.writelines(w + \"\\n\")\n",
    "    fw.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 训练模型\n",
    "def trainTFIDF(train_set,mdlFile,dicFile,idxFile,title_set,TitleFile):\n",
    "    # 生成字典\n",
    "    dictionary = corpora.Dictionary(train_set)\n",
    "    dictionary.filter_extremes(no_below=1,no_above=1,keep_n=None)\n",
    "    dictionary.save(dicFile)\n",
    "    \n",
    "    # 生成语料\n",
    "    corpus = [dictionary.doc2bow(text) for text in train_set]\n",
    "    \n",
    "    #使用数字语料生成TFIDF模型\n",
    "    tfidfModel = models.TfidfModel(corpus)\n",
    "    #存储tfidfModel\n",
    "    tfidfModel.save(mdlFile)\n",
    "\n",
    "    #把全部语料向量化成TFIDF模式,这个tfidfModel可以传入二维数组\n",
    "    tfidfVectors = tfidfModel[corpus]\n",
    "    #建立索引并保存\n",
    "    indexTfidf = similarities.MatrixSimilarity(tfidfVectors)\n",
    "    indexTfidf.save(idxFile)\n",
    "    # 把文件标题索引写入文件\n",
    "    writeTitleSet(TitleFile,title_set,\"a\")  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 训练模型\n",
    "def trainLDA(train_set,mdlFile,dicFile,idxFile,title_set,TitleFile):\n",
    "    # 生成字典\n",
    "    dictionary = corpora.Dictionary(train_set)\n",
    "    dictionary.filter_extremes(no_below=1,no_above=1,keep_n=None)\n",
    "    dictionary.save(dicFile)\n",
    "    \n",
    "    # 生成语料\n",
    "    corpus = [dictionary.doc2bow(text) for text in train_set]\n",
    "    \n",
    "    #使用数字语料生成TFIDF模型\n",
    "    tfidfModel = models.TfidfModel(corpus)\n",
    "\n",
    "    #把全部语料向量化成TFIDF模式,这个tfidfModel可以传入二维数组\n",
    "    tfidfVectors = tfidfModel[corpus]\n",
    "    \n",
    "    #通过TFIDF向量生成LDA模型,id2word表示编号的对应词典,num_topics表示主题数,我们这里设定的10\n",
    "    lda = models.LdaModel(tfidfVectors, id2word=dictionary, num_topics=50)\n",
    "    #把模型保存下来\n",
    "    lda.save(mdlFile)\n",
    "    #把所有TFIDF向量变成LDA的向量\n",
    "    corpus_lda = lda[tfidfVectors]\n",
    "    #建立索引,把LDA数据保存下来\n",
    "    indexLDA = similarities.MatrixSimilarity(corpus_lda)\n",
    "    indexLDA.save(idxFile)\n",
    "    \n",
    "    # 把文件标题索引写入文件\n",
    "    writeTitleSet(TitleFile,title_set,\"w\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def trainModel():\n",
    "    \n",
    "    inFile = \"./data/processed/all_summary.txt\"\n",
    "    title_set,train_set = getTrainSet(inFile)\n",
    "\n",
    "    TFIDF_mdl = \"./model/all_test_TFIDF.mdl\"\n",
    "    TFIDF_dic = \"./model/all_test_TFIDF.dic\"\n",
    "    TFIDF_idx = \"./model/all_test_TFIDF.idx\"\n",
    "    \n",
    "    LDA_mdl = \"./model/all_test_LDA50TOPIC.mdl\"\n",
    "    LDA_dic = \"./model/all_test_LDA50TOPIC.dic\"\n",
    "    LDA_idx = \"./model/all_test_LDA50TOPIC.idx\"\n",
    "    \n",
    "    TitleFile = \"./model/infoSet.txt\"\n",
    "    \n",
    "    # 训练TFIDF\n",
    "    trainTFIDF(train_set,TFIDF_mdl,TFIDF_dic,TFIDF_idx,title_set,TitleFile)\n",
    "    \n",
    "    # 训练LDA\n",
    "    trainLDA(train_set,LDA_mdl,LDA_dic,LDA_idx,title_set,TitleFile)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def main():\n",
    "    \n",
    "    trainModel()\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "if __name__ == '__main__':\n",
    "    main()"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
